from bs4 import BeautifulSoup
import re
import os
import requests
import csv
import schedule
from request_utils import ask_url

# URL of the page to scrape
BASE_URL = "https://movie.douban.com/chart"
# Output CSV path (a relative path, so it resolves against the working directory)
CSV_PATH = "../北美票房榜.csv"

# Regular expressions applied to stringified HTML elements
# Captures a date of the form "<1-2 digits>月<1-2 digits>日" that follows ">"
# and precedes (optional whitespace and) "更新"
FIND_DAY = re.compile(r">(\d{1,2}月\d{1,2}日)\s*更新")
# Captures the text that sits directly before a closing </a> tag
FIND_TITLE = re.compile(r'>([^<]+)</a>')
# Captures the text between the "box_chart_num color-gray" span's opening tag
# and "万</span>"; re.S lets "." match across newlines
FIND_MONEY = re.compile(r'<span class="box_chart_num color-gray">(.*?)万</span>', re.S)


def main():
    """Run the scrape-and-save pipeline for the box-office chart.

    Downloads and parses the page at ``BASE_URL`` with ``get_data`` and
    writes the resulting rows to ``CSV_PATH`` with ``save_data``.
    """
    # get_data also returns the chart's update date; it is not used here.
    datalist, _month = get_data(BASE_URL)

    # Use the module-level constant rather than repeating the path literal,
    # so the output location is defined in exactly one place.
    save_data(datalist, CSV_PATH)
    print("爬取完毕")


def get_data(baseurl):
    """Download the chart page and extract ``[title, money]`` rows.

    Args:
        baseurl: URL of the page to fetch with ``ask_url``.

    Returns:
        A ``(datalist, month)`` tuple. ``datalist`` is a list of
        ``[title, money]`` string pairs, one for each parsed
        ``li.clearfix`` element after the first ten. ``month`` is the first
        date string matched by ``FIND_DAY`` inside a
        ``span.box_chart_num.color-gray`` element, or ``""`` if none matched.
    """
    datalist = []
    month = ""

    html = ask_url(baseurl)
    # NOTE(review): ask_url is defined in request_utils; presumably it can
    # return an empty value when the request fails — confirm. Bail out with
    # empty results instead of handing a non-document to the parser.
    if not html:
        print(datalist, month)
        return datalist, month

    soup = BeautifulSoup(html, 'html.parser')

    # The update date is taken from the first matching span only.
    for span in soup.find_all('span', class_="box_chart_num color-gray"):
        day_match = FIND_DAY.search(str(span))
        if day_match is not None:
            month = day_match.group(1)
            print(month)
            break

    # The first ten li.clearfix elements are skipped.
    # NOTE(review): presumably they belong to a different list on the page —
    # confirm against the live markup.
    leading_items_to_skip = 10
    for position, node in enumerate(soup.find_all('li', class_="clearfix"), start=1):
        if position <= leading_items_to_skip:
            continue

        markup = str(node)
        title_match = FIND_TITLE.search(markup)
        money_match = FIND_MONEY.search(markup)
        # An element without a title link or a box-office span is not a
        # chart row; skip it rather than crash with IndexError.
        if title_match is None or money_match is None:
            continue

        # Drop line breaks and surrounding indentation regardless of how many
        # spaces the page happens to use.
        title = title_match.group(1).replace("\n", "").strip()
        datalist.append([title, money_match.group(1)])

    print(datalist, month)
    return datalist, month


def save_data(datalist, savepath):
    """Write the scraped rows to a CSV file, replacing any previous content.

    Args:
        datalist: Iterable of rows; each row is an iterable of cell values
            (here ``[title, money]``).
        savepath: Destination file path.

    Raises:
        OSError: If the file cannot be opened for writing.
    """
    # Mode 'w' already truncates an existing file, so no separate
    # exists()/remove() step is needed (and the check-then-delete race goes
    # away). 'utf-8-sig' writes a BOM at the start of the file; newline=''
    # lets the csv module control line endings.
    with open(savepath, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        writer.writerow(['电影名称', '电影票房'])
        writer.writerows(datalist)
    print(f"数据已成功保存至 {savepath}")


# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
    print("北美票房数据已获取")